{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#!pip install sentencepiece\n",
    "#!pip install pyarrow==15.0.2\n",
    "#!pip install datasets\n",
    "#!pip install nlpaug\n",
    "#!pip install evaluate\n",
    "#!pip install optuna"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Naive Bayes Baseline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score, f1_score\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "import hashlib\n",
    "from imblearn.over_sampling import RandomOverSampler\n",
    "\n",
    "BASEPATH = \"./drive/MyDrive/IsraelTransformer2\"\n",
    "\n",
    "\n",
    "def load_and_preprocess_data():\n",
    "    df_prev = pd.concat([pd.read_csv(f\"./drive/MyDrive/IsraelTransformer/ManualCoding{i}.csv\") for i in range(1, 4)], ignore_index=True)\n",
    "    df_prev = df_prev[df_prev[\"stance\"].isin([\"-1\", \"1\", \"0\"])]\n",
    "    label_map = {'1': 2, '-1': 0, '0': 1}\n",
    "    df_prev[\"stance\"] = df_prev[\"stance\"].map(lambda x: label_map[str(x)])\n",
    "    df_prev = df_prev.sample(frac=1, random_state=42).reset_index(drop=True)\n",
    "    df_prev[\"cid_entry\"] = df_prev[\"text\"].apply(lambda x: hashlib.md5(x.encode()).hexdigest())\n",
    "    df_prev.rename(columns={\"stance\": \"labels\"}, inplace=True)\n",
    "    df_new = pd.read_csv(f\"{BASEPATH}/df_final.csv\")\n",
    "    df_new[\"labels\"] = df_new[\"labels\"] + 1\n",
    "    df_prev = df_prev.loc[:,[\"cid_entry\",\"text\", \"labels\"]]\n",
    "    df_new = df_new.loc[:,[\"cid_entry\",\"text\", \"labels\"]]\n",
    "    df = pd.concat([df_prev, df_new], ignore_index=True)\n",
    "    df = df.sample(frac=1, random_state=42).reset_index(drop=True)\n",
    "\n",
    "    # Split the data\n",
    "    # train_test_split returns a tuple of arrays, we need to unpack it correctly\n",
    "    train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df['labels'], random_state=42)\n",
    "    val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['labels'], random_state=42)\n",
    "\n",
    "    # Implement oversampling for the training data\n",
    "    oversampler = RandomOverSampler(random_state=42)\n",
    "    X_train_resampled, y_train_resampled = oversampler.fit_resample(\n",
    "        train_df[['cid_entry', 'text']], train_df['labels']\n",
    "    )\n",
    "    train_df_resampled = pd.DataFrame(X_train_resampled, columns=['cid_entry', 'text'])\n",
    "    train_df_resampled['labels'] = y_train_resampled\n",
    "\n",
    "    print(\"Original class distribution:\")\n",
    "    print(train_df['labels'].value_counts(normalize=True))\n",
    "    print(\"\\nResampled class distribution:\")\n",
    "    print(train_df_resampled['labels'].value_counts(normalize=True))\n",
    "\n",
    "    return train_df_resampled, val_df, test_df\n",
    "\n",
    "def train_and_evaluate_naive_bayes(df):\n",
    "    train_df, val_df, test_df = df\n",
    "    # Vectorize the text data\n",
    "    vectorizer = TfidfVectorizer(max_features=5000)\n",
    "    X_train = vectorizer.fit_transform(train_df['text'])\n",
    "    X_val = vectorizer.transform(val_df['text'])\n",
    "    X_test = vectorizer.transform(test_df['text'])\n",
    "\n",
    "    # Apply oversampling to the training data\n",
    "    oversampler = RandomOverSampler(random_state=42)\n",
    "    X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, train_df['labels'])\n",
    "\n",
    "    print(\"Original class distribution:\")\n",
    "    print(train_df['labels'].value_counts(normalize=True))\n",
    "    print(\"\\nResampled class distribution:\")\n",
    "    print(pd.Series(y_train_resampled).value_counts(normalize=True))\n",
    "\n",
    "    # Train the Naive Bayes classifier on the resampled data\n",
    "    clf = MultinomialNB()\n",
    "    clf.fit(X_train_resampled, y_train_resampled)\n",
    "\n",
    "    # Evaluate on validation set\n",
    "    val_preds = clf.predict(X_val)\n",
    "    val_results = {\n",
    "        'balanced_accuracy': balanced_accuracy_score(val_df['labels'], val_preds),\n",
    "        'macro_f1': f1_score(val_df['labels'], val_preds, average='macro'),\n",
    "        'weighted_f1': f1_score(val_df['labels'], val_preds, average='weighted')\n",
    "    }\n",
    "\n",
    "    print(\"Validation Results for Naive Bayes:\")\n",
    "    for key, value in val_results.items():\n",
    "        print(f\"{key}: {value:.4f}\")\n",
    "\n",
    "    # Final evaluation on the test set\n",
    "    test_preds = clf.predict(X_test)\n",
    "\n",
    "    print(\"Test Set Classification Report for Naive Bayes:\")\n",
    "    print(classification_report(test_df['labels'], test_preds, digits=3))\n",
    "\n",
    "    # Save the classification report\n",
    "    report = classification_report(test_df['labels'], test_preds, digits=3, output_dict=True)\n",
    "    report_df = pd.DataFrame(report).transpose()\n",
    "    report_df.to_csv(f'{BASEPATH}/classification_report_naive_bayes_oversampled.csv', index=True)\n",
    "\n",
    "    # Save the confusion matrix\n",
    "    cm = confusion_matrix(test_df['labels'], test_preds)\n",
    "    cm_df = pd.DataFrame(cm, columns=['Predicted Palestine', 'Predicted Neutral', 'Predicted Israel'],\n",
    "                         index=['Actual Palestine', 'Actual Neutral', 'Actual Israel'])\n",
    "    cm_df.to_csv(f'{BASEPATH}/confusion_matrix_naive_bayes_oversampled.csv', index=True)\n",
    "\n",
    "def main():\n",
    "    df = load_and_preprocess_data()\n",
    "    print(\"Training and evaluating Naive Bayes classifier with oversampling\")\n",
    "    train_and_evaluate_naive_bayes(df)\n",
    "    print(\"Training and evaluation complete for Naive Bayes baseline with oversampling.\")\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    main()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train Transformer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.metrics import classification_report, confusion_matrix, balanced_accuracy_score, f1_score\n",
    "from sklearn.model_selection import train_test_split\n",
    "from datasets import Dataset\n",
    "import evaluate\n",
    "import os\n",
    "import torch\n",
    "import matplotlib.pyplot as plt\n",
    "import shutil\n",
    "import hashlib\n",
    "import optuna\n",
    "from transformers import (TrainerCallback, AutoModelForSequenceClassification, AutoTokenizer,\n",
    "                          TrainingArguments, Trainer, EarlyStoppingCallback, get_linear_schedule_with_warmup, AdamW, AutoConfig)\n",
    "from imblearn.over_sampling import RandomOverSampler\n",
    "\n",
    "BASEPATH = \"./drive/MyDrive/IsraelTransformer2\"\n",
    "model_name = \"xlm-roberta-large\"\n",
    "RSEED = 0\n",
    "\n",
    "class CustomCallback(TrainerCallback):\n",
    "    def __init__(self):\n",
    "        super().__init__()\n",
    "        self.train_losses = []\n",
    "        self.val_losses = []\n",
    "\n",
    "    def on_log(self, args, state, control, logs=None, **kwargs):\n",
    "        if 'loss' in logs:\n",
    "            self.train_losses.append(logs['loss'])\n",
    "        if 'eval_loss' in logs:\n",
    "            self.val_losses.append(logs['eval_loss'])\n",
    "\n",
    "def load_and_preprocess_data():\n",
    "    df_prev = pd.concat([pd.read_csv(f\"./drive/MyDrive/IsraelTransformer/ManualCoding{i}.csv\") for i in range(1, 4)], ignore_index=True)\n",
    "    df_prev = df_prev[df_prev[\"stance\"].isin([\"-1\", \"1\", \"0\"])]\n",
    "    label_map = {'1': 2, '-1': 0, '0': 1}\n",
    "    df_prev[\"stance\"] = df_prev[\"stance\"].map(lambda x: label_map[str(x)])\n",
    "    df_prev = df_prev.sample(frac=1, random_state=RSEED).reset_index(drop=True)\n",
    "    df_prev[\"cid_entry\"] = df_prev[\"text\"].apply(lambda x: hashlib.md5(x.encode()).hexdigest())\n",
    "    df_prev.rename(columns={\"stance\": \"labels\"}, inplace=True)\n",
    "    df_new = pd.read_csv(f\"{BASEPATH}/df_final.csv\")\n",
    "    df_new[\"labels\"] = df_new[\"labels\"] + 1\n",
    "    df_prev = df_prev.loc[:,[\"cid_entry\",\"text\", \"labels\"]]\n",
    "    df_new = df_new.loc[:,[\"cid_entry\",\"text\", \"labels\"]]\n",
    "    df = pd.concat([df_prev, df_new], ignore_index=True)\n",
    "    df = df.sample(frac=1, random_state=RSEED).reset_index(drop=True)\n",
    "\n",
    "    # Split the data\n",
    "    train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df['labels'], random_state=RSEED)\n",
    "    val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['labels'], random_state=RSEED)\n",
    "\n",
    "    # Implement oversampling for the training data\n",
    "    oversampler = RandomOverSampler(random_state=RSEED)\n",
    "    X_train_resampled, y_train_resampled = oversampler.fit_resample(\n",
    "        train_df[['cid_entry', 'text']], train_df['labels']\n",
    "    )\n",
    "    train_df_resampled = pd.DataFrame(X_train_resampled, columns=['cid_entry', 'text'])\n",
    "    train_df_resampled['labels'] = y_train_resampled\n",
    "\n",
    "    print(\"Original class distribution:\")\n",
    "    print(train_df['labels'].value_counts(normalize=True))\n",
    "    print(\"\\nResampled class distribution:\")\n",
    "    print(train_df_resampled['labels'].value_counts(normalize=True))\n",
    "\n",
    "    return train_df_resampled, val_df, test_df\n",
    "\n",
    "def compute_metrics(eval_pred):\n",
    "    logits, labels = eval_pred\n",
    "    predictions = np.argmax(logits, axis=-1)\n",
    "    return {\n",
    "        'balanced_accuracy': balanced_accuracy_score(labels, predictions),\n",
    "        'macro_f1': f1_score(labels, predictions, average='macro'),\n",
    "        'weighted_f1': f1_score(labels, predictions, average='weighted')\n",
    "    }\n",
    "\n",
    "def prepare_datasets(train_df, val_df, test_df, tokenizer):\n",
    "    train_dataset = Dataset.from_pandas(train_df)\n",
    "    val_dataset = Dataset.from_pandas(val_df)\n",
    "    test_dataset = Dataset.from_pandas(test_df)\n",
    "\n",
    "    def tokenize_and_format(examples):\n",
    "        tokenized = tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128, return_tensors=\"pt\")\n",
    "        return {\n",
    "            'input_ids': tokenized['input_ids'].squeeze(),\n",
    "            'attention_mask': tokenized['attention_mask'].squeeze(),\n",
    "            'labels': torch.tensor(examples['labels'])\n",
    "        }\n",
    "\n",
    "    train_dataset = train_dataset.map(tokenize_and_format, batched=True, batch_size=8, remove_columns=['cid_entry', 'text'])\n",
    "    val_dataset = val_dataset.map(tokenize_and_format, batched=True, batch_size=8, remove_columns=['cid_entry', 'text'])\n",
    "    test_dataset = test_dataset.map(tokenize_and_format, batched=True, batch_size=8, remove_columns=['cid_entry', 'text'])\n",
    "\n",
    "    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])\n",
    "    val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])\n",
    "    test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])\n",
    "\n",
    "    print(\"Train dataset size:\", len(train_dataset))\n",
    "    print(\"Validation dataset size:\", len(val_dataset))\n",
    "    print(\"Test dataset size:\", len(test_dataset))\n",
    "\n",
    "    return train_dataset, val_dataset, test_dataset\n",
    "def objective(trial):\n",
    "    global best_f1, best_model_path, best_results, best_report, best_cm\n",
    "\n",
    "    config = AutoConfig.from_pretrained(model_name)\n",
    "    config.num_labels = 3\n",
    "\n",
    "    # Hyperparameters to tune\n",
    "    config.hidden_dropout_prob = trial.suggest_float(\"hidden_dropout_prob\", 0.1, 0.5)\n",
    "    learning_rate = trial.suggest_loguniform(\"learning_rate\", 1e-6, 1e-4)\n",
    "    weight_decay = trial.suggest_loguniform(\"weight_decay\", 1e-3, 1e-1)\n",
    "    warmup_ratio = trial.suggest_uniform(\"warmup_ratio\", 0.05, 0.2)\n",
    "    num_frozen_layers = trial.suggest_int(\"num_frozen_layers\", 20, 24)\n",
    "\n",
    "    model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)\n",
    "\n",
    "    # Freeze layers\n",
    "    for layer in model.roberta.encoder.layer[:num_frozen_layers]:\n",
    "        for param in layer.parameters():\n",
    "            param.requires_grad = False\n",
    "    for param in model.roberta.embeddings.parameters():\n",
    "        param.requires_grad = False\n",
    "\n",
    "    training_args = TrainingArguments(\n",
    "        output_dir=f'./results_{model_name.split(\"/\")[-1]}_{trial.number}',\n",
    "        num_train_epochs=20,\n",
    "        per_device_train_batch_size=8,\n",
    "        per_device_eval_batch_size=8,\n",
    "        warmup_ratio=warmup_ratio,\n",
    "        learning_rate=learning_rate,\n",
    "        weight_decay=weight_decay,\n",
    "        logging_dir=f'./logs_{model_name.split(\"/\")[-1]}_{trial.number}',\n",
    "        logging_steps=5,\n",
    "        evaluation_strategy='steps',\n",
    "        eval_steps=100,\n",
    "        save_strategy='steps',\n",
    "        save_steps=100,\n",
    "        load_best_model_at_end=True,\n",
    "        metric_for_best_model='macro_f1',\n",
    "        greater_is_better=True,\n",
    "        fp16=True,\n",
    "        gradient_accumulation_steps=2,\n",
    "        max_grad_norm=1.0,\n",
    "    )\n",
    "\n",
    "    lr_scheduler = get_linear_schedule_with_warmup(\n",
    "        optimizer=AdamW(model.parameters(), lr=training_args.learning_rate),\n",
    "        num_warmup_steps=int(warmup_ratio * (len(train_dataset) / training_args.per_device_train_batch_size) * training_args.num_train_epochs),\n",
    "        num_training_steps=(len(train_dataset) / training_args.per_device_train_batch_size) * training_args.num_train_epochs\n",
    "    )\n",
    "\n",
    "    custom_callback = CustomCallback()\n",
    "\n",
    "    trainer = Trainer(\n",
    "        model=model,\n",
    "        args=training_args,\n",
    "        train_dataset=train_dataset,\n",
    "        eval_dataset=val_dataset,\n",
    "        compute_metrics=compute_metrics,\n",
    "        callbacks=[EarlyStoppingCallback(early_stopping_patience=5), custom_callback],\n",
    "        optimizers=(AdamW(model.parameters(), lr=learning_rate), lr_scheduler),\n",
    "    )\n",
    "\n",
    "    trainer.train()\n",
    "\n",
    "    # Evaluate on validation set\n",
    "    val_results = trainer.evaluate()\n",
    "    val_f1 = val_results['eval_macro_f1']\n",
    "\n",
    "    # Also get test results for final reporting\n",
    "    test_results = trainer.predict(test_dataset)\n",
    "    test_preds = np.argmax(test_results.predictions, axis=-1)\n",
    "    test_f1 = f1_score(test_dataset['labels'], test_preds, average='macro')\n",
    "\n",
    "    print(f\"Trial {trial.number} - Validation Macro F1: {val_f1:.4f}, Test Macro F1: {test_f1:.4f}\")\n",
    "\n",
    "    if val_f1 > best_f1:\n",
    "        print(f\"New best model found! Validation F1: {val_f1:.4f}\")\n",
    "        best_f1 = val_f1\n",
    "\n",
    "        # Save the new best model\n",
    "        new_best_model_path = f'{BASEPATH}/best_model_{model_name.split(\"/\")[-1]}_{trial.number}'\n",
    "        trainer.save_model(new_best_model_path)\n",
    "\n",
    "        # Delete the old best model if it exists\n",
    "        if best_model_path and os.path.exists(best_model_path):\n",
    "            shutil.rmtree(best_model_path)\n",
    "\n",
    "        best_model_path = new_best_model_path\n",
    "\n",
    "        # Save best results\n",
    "        best_results = {\n",
    "            'test_f1': test_f1,\n",
    "            'val_results': val_results,\n",
    "            'params': trial.params,\n",
    "            'train_losses': custom_callback.train_losses,\n",
    "            'val_losses': custom_callback.val_losses\n",
    "        }\n",
    "\n",
    "        # Generate and save classification report\n",
    "        best_report = classification_report(test_dataset['labels'], test_preds, digits=3, output_dict=True)\n",
    "\n",
    "        # Generate and save confusion matrix\n",
    "        best_cm = confusion_matrix(test_dataset['labels'], test_preds)\n",
    "\n",
    "    # Clean up to save disk space\n",
    "    if os.path.exists(training_args.output_dir):\n",
    "        shutil.rmtree(training_args.output_dir)\n",
    "    if os.path.exists(training_args.logging_dir):\n",
    "        shutil.rmtree(training_args.logging_dir)\n",
    "\n",
    "    return val_f1\n",
    "\n",
    "def run_hyperparameter_tuning():\n",
    "    global best_f1, best_model_path, best_results, best_report, best_cm\n",
    "    best_f1 = 0.0\n",
    "    best_model_path = None\n",
    "    best_results = None\n",
    "    best_report = None\n",
    "    best_cm = None\n",
    "\n",
    "    study = optuna.create_study(direction=\"maximize\")\n",
    "    study.optimize(objective, n_trials=30)\n",
    "\n",
    "    print(\"Best trial:\")\n",
    "    trial = study.best_trial\n",
    "    print(\"  Value: \", trial.value)\n",
    "    print(\"  Params: \")\n",
    "    for key, value in trial.params.items():\n",
    "        print(\"    {}: {}\".format(key, value))\n",
    "\n",
    "    return study.best_trial\n",
    "\n",
    "def main():\n",
    "    global train_dataset, val_dataset, test_dataset\n",
    "    train_df, val_df, test_df = load_and_preprocess_data()\n",
    "    tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
    "    train_dataset, val_dataset, test_dataset = prepare_datasets(train_df, val_df, test_df, tokenizer)\n",
    "\n",
    "    print(\"Starting hyperparameter tuning...\")\n",
    "    best_trial = run_hyperparameter_tuning()\n",
    "\n",
    "    print(f\"Best model saved at: {best_model_path}\")\n",
    "    print(f\"Best Test Macro F1: {best_f1:.4f}\")\n",
    "\n",
    "    # Save the best results\n",
    "    print(\"Saving best model results...\")\n",
    "\n",
    "    # Save validation results\n",
    "    print(f\"Validation Results for {model_name}:\")\n",
    "    for key, value in best_results['val_results'].items():\n",
    "        print(f\"{key}: {value:.4f}\")\n",
    "\n",
    "    # Save the classification report\n",
    "    report_df = pd.DataFrame(best_report).transpose()\n",
    "    report_df.to_csv(f'{BASEPATH}/classification_report_{model_name.split(\"/\")[-1]}_best.csv', index=True)\n",
    "\n",
    "    # Save the confusion matrix\n",
    "    cm_df = pd.DataFrame(best_cm, columns=['Predicted Palestine', 'Predicted Neutral', 'Predicted Israel'],\n",
    "                         index=['Actual Palestine', 'Actual Neutral', 'Actual Israel'])\n",
    "    cm_df.to_csv(f'{BASEPATH}/confusion_matrix_{model_name.split(\"/\")[-1]}_best.csv', index=True)\n",
    "\n",
    "    # Plot and save learning curves\n",
    "    plt.figure(figsize=(10, 5))\n",
    "    plt.plot(best_results['train_losses'], label='Training Loss')\n",
    "    plt.plot(best_results['val_losses'], range(0, len(best_results['val_losses']) * 100, 100), label='Validation Loss')\n",
    "    plt.title(f'Learning Curves - {model_name} (Best Model)')\n",
    "    plt.xlabel('Steps')\n",
    "    plt.ylabel('Loss')\n",
    "    plt.legend()\n",
    "    plt.savefig(f'{BASEPATH}/learning_curves_{model_name.split(\"/\")[-1]}_best.png')\n",
    "    plt.close()\n",
    "\n",
    "    # Save best hyperparameters\n",
    "    with open(f'{BASEPATH}/best_hyperparameters_{model_name.split(\"/\")[-1]}.txt', 'w') as f:\n",
    "        for key, value in best_trial.params.items():\n",
    "            f.write(f\"{key}: {value}\\n\")\n",
    "\n",
    "    print(\"Training and evaluation complete. All results for the best model have been saved.\")\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    main()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import torch\n",
    "import os\n",
    "import glob\n",
    "from tqdm import tqdm\n",
    "from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig\n",
    "\n",
    "BASEPATH = \"./drive/MyDrive/IsraelTransformer2\"\n",
    "model_name = \"xlm-roberta-large\"\n",
    "best_model_path = f'{BASEPATH}/best_model_xlm-roberta-large_21'  # Path to the best model\n",
    "\n",
    "# Create Labelling folder if it doesn't exist\n",
    "labelling_folder = \"./Labelling\"\n",
    "os.makedirs(labelling_folder, exist_ok=True)\n",
    "\n",
    "# Load the best model and tokenizer\n",
    "config = AutoConfig.from_pretrained(best_model_path)\n",
    "model = AutoModelForSequenceClassification.from_pretrained(best_model_path, config=config)\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name)  # Load tokenizer from the original model name\n",
    "\n",
    "# Move model to GPU if available\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "model.to(device)\n",
    "model.eval()\n",
    "\n",
    "def tokenize_and_encode(texts, tokenizer, max_length=128):\n",
    "    return tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors=\"pt\")\n",
    "\n",
    "def predict_stance(model, encoded_texts, device):\n",
    "    input_ids = encoded_texts['input_ids'].to(device)\n",
    "    attention_mask = encoded_texts['attention_mask'].to(device)\n",
    "\n",
    "    with torch.no_grad():\n",
    "        outputs = model(input_ids, attention_mask=attention_mask)\n",
    "        predictions = torch.argmax(outputs.logits, dim=1)\n",
    "\n",
    "    return predictions.cpu().numpy()\n",
    "\n",
    "def process_batch(batch_df, model, tokenizer, device):\n",
    "    encoded_texts = tokenize_and_encode(batch_df['text'].tolist(), tokenizer)\n",
    "    predictions = predict_stance(model, encoded_texts, device)\n",
    "\n",
    "    # Map predictions to labels\n",
    "    label_map = {0: 'Palestine', 1: 'Neutral', 2: 'Israel'}\n",
    "    batch_df['predicted_stance'] = [label_map[pred] for pred in predictions]\n",
    "\n",
    "    return batch_df\n",
    "\n",
    "def main():\n",
    "    # Load the data to label\n",
    "    data_to_label = pd.read_csv(\"./drive/MyDrive/IsraelTransformer2/matching_posts.csv.gz\")\n",
    "    data_to_label = data_to_label.loc[:,[\"cid_entry\",\"text\"]]\n",
    "\n",
    "    batch_size = 1_000\n",
    "    num_batches = len(data_to_label) // batch_size + (1 if len(data_to_label) % batch_size != 0 else 0)\n",
    "\n",
    "    all_labelled_dfs = []\n",
    "\n",
    "    for i in tqdm(range(num_batches), desc=\"Processing batches\"):\n",
    "        start_idx = i * batch_size\n",
    "        end_idx = min((i + 1) * batch_size, len(data_to_label))\n",
    "\n",
    "        batch_df = data_to_label.iloc[start_idx:end_idx].copy()\n",
    "\n",
    "        # Check if this batch has already been labelled\n",
    "        batch_filename = f\"{labelling_folder}/labelled_batch_{i+1}.csv.gz\"\n",
    "        if os.path.exists(batch_filename):\n",
    "            print(f\"Batch {i+1} already labelled. Skipping...\")\n",
    "            labelled_batch = pd.read_csv(batch_filename, compression='gzip')\n",
    "            all_labelled_dfs.append(labelled_batch)\n",
    "            continue\n",
    "\n",
    "        # Process the batch\n",
    "        labelled_batch = process_batch(batch_df, model, tokenizer, device)\n",
    "\n",
    "        # Save the labelled batch\n",
    "        labelled_batch.to_csv(batch_filename, index=False, compression='gzip')\n",
    "        all_labelled_dfs.append(labelled_batch)\n",
    "\n",
    "    # Concatenate all labelled dataframes\n",
    "    final_df = pd.concat(all_labelled_dfs, ignore_index=True)\n",
    "\n",
    "    # Export the final concatenated dataframe\n",
    "    final_df.to_csv(f\"{labelling_folder}/all_labelled_data.csv.gz\", index=False, compression='gzip')\n",
    "\n",
    "    print(\"Labelling complete. All data has been processed and exported.\")\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    main()"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
